City Taxi Rides Analysis

  • Imports
  • Load data
  • Create datetime cols
  • City Geodata
  • EDA
    • Number of rides per hour
    • Median number of miles per hour
    • Median fare per trip
    • Total revenue
    • Hourly revenue
    • Rides velocity
    • Revenue per minute
    • Trip destinations
  • Choropleth Map
    • Pickup choropleth
    • Dropoff choropleth
    • Matplotlib choropleth
    • City Grid
  • Adding grid to dropoffs
  • Grid Dropoff Choropleth
    • Adding Famous Places
    • Grid cells with more than 2000 dropoffs
  • Origin Destination Matrix
    • Create Grid fror ODM
    • Adjust CRS
    • sjoin grid cells
    • create ODM

Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from datetime import datetime, timedelta
import numpy as np
import plotly.offline as pyo
import plotly.graph_objs as go
from plotly import tools
pyo.init_notebook_mode(connected=True)
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
import geopandas as gpd
import folium
from shapely.geometry import Point
import shapely.wkt
from shapely.geometry import Polygon
import warnings
warnings.filterwarnings('ignore')

We are analyzing 1 day of taxi rides.

Load data

In [2]:
# load rides dataframe
df = pd.read_csv('/Users/alex/Documents/Transportation_Network_Providers_-_Trips.csv', usecols=['Trip Start Timestamp', 'Trip End Timestamp', 'Trip Seconds',  'Trip Miles', 'Pickup Census Tract', 'Dropoff Census Tract', 'Pickup Community Area', 'Dropoff Community Area', 'Fare', 'Tip', 'Additional Charges', 'Trip Total', 'Shared Trip Authorized', 'Trips Pooled', 'Pickup Centroid Latitude', 'Pickup Centroid Longitude', 'Pickup Centroid Location', 'Dropoff Centroid Latitude', 'Dropoff Centroid Longitude', 'Dropoff Centroid Location'], nrows = 300000)
# load city areas geodataframe
chicago = gpd.read_file('/Users/alex/Documents/Boundaries - Community Areas (current).geojson')
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 20 columns):
Trip Start Timestamp          300000 non-null object
Trip End Timestamp            300000 non-null object
Trip Seconds                  300000 non-null int64
Trip Miles                    300000 non-null float64
Pickup Census Tract           210676 non-null float64
Dropoff Census Tract          209071 non-null float64
Pickup Community Area         281992 non-null float64
Dropoff Community Area        279447 non-null float64
Fare                          300000 non-null float64
Tip                           300000 non-null int64
Additional Charges            300000 non-null float64
Trip Total                    300000 non-null float64
Shared Trip Authorized        300000 non-null bool
Trips Pooled                  300000 non-null int64
Pickup Centroid Latitude      282149 non-null float64
Pickup Centroid Longitude     282149 non-null float64
Pickup Centroid Location      282149 non-null object
Dropoff Centroid Latitude     279606 non-null float64
Dropoff Centroid Longitude    279606 non-null float64
Dropoff Centroid Location     279606 non-null object
dtypes: bool(1), float64(12), int64(3), object(4)
memory usage: 43.8+ MB

Datetime Columns

In [4]:
# Create datetime columns:
df['Trip Start Timestamp'] = df['Trip Start Timestamp'].apply(lambda t: pd.to_datetime(t, format='%m/%d/%Y %I:%M:%S %p'))
df['Trip End Timestamp'] = df['Trip End Timestamp'].apply(lambda t: pd.to_datetime(t, format='%m/%d/%Y %I:%M:%S %p'))

Upload City GeoData (Community Areas)

In [5]:
# load city geodataframe
chicago = gpd.read_file('/Users/alex/Documents/Boundaries - Community Areas (current).geojson')

EDA

Number of Rides per Hour

In [6]:
# create rides per hour during the day
rides_per_hour = pd.DataFrame(df.resample('H', on='Trip Start Timestamp').size()).reset_index()
rides_per_hour.columns = ['Timeline', 'Number of trips']
In [7]:
# number of rides per hour visualization
trace = go.Bar(x=rides_per_hour['Timeline'],
               y=rides_per_hour['Number of trips'])

layout = go.Layout(
    title='Number of Rides Per Hour',
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)

Median number of miles per trip per hour

In [8]:
# create edian number of miles per ride 
ride_miles_per_hour = pd.DataFrame(df.resample('H', on='Trip Start Timestamp')['Trip Miles'].median()).reset_index()
In [9]:
# Median miles per ride visualization
trace = go.Bar(x=ride_miles_per_hour['Trip Start Timestamp'], 
               y=ride_miles_per_hour['Trip Miles'])

layout = go.Layout(
    title='Average Miles per Hour',
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)

Median fare per trip

In [10]:
# create median fare per trip by hour
ride_fare_per_hour = pd.DataFrame(df.resample('H', on='Trip Start Timestamp').Fare.median()).reset_index()
In [11]:
trace = go.Bar(x=ride_fare_per_hour['Trip Start Timestamp'], 
               y=ride_fare_per_hour.Fare)

layout = go.Layout(
    title='Median fare per trip [USD]',
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)

Total Revenue (Fare + Additional charges)

In [12]:
#total fare per day
total_fare_per_day = df.Fare.sum()
total_fare_per_day
Out[12]:
3367032.5
In [13]:
# total additional charges per day
total_add_charges_per_day = df['Additional Charges'].sum()
total_add_charges_per_day.round(2)
Out[13]:
860523.52
In [14]:
# total revenue
total_revenue = (total_fare_per_day + total_add_charges_per_day).round(2)
total_revenue
Out[14]:
4227556.02

Revenue per Hour

In [15]:
# revenue per hour calculation
ride_fare_per_hour = pd.DataFrame(df.resample('H', on='Trip Start Timestamp')['Fare'].sum()).reset_index()
ride_charge_per_hour = pd.DataFrame(df.resample('H', on='Trip Start Timestamp')['Additional Charges'].sum()).reset_index()
In [16]:
# hourly revenue visualizations
trace1 = go.Bar(
    x=ride_fare_per_hour['Trip Start Timestamp'],
    y=ride_fare_per_hour.Fare,
    name='Fare'
)
trace2 = go.Bar(
    x=ride_charge_per_hour['Trip Start Timestamp'],
    y=ride_charge_per_hour['Additional Charges'],
    name='Additonal Charges'
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='stack',
    title='Revenue per Hour',
)

fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig, filename='stacked-bar')

Rides Velocity

In [17]:
# Average velocity per ride (miles/hour)
df['velocity'] = df['Trip Miles']/(df['Trip Seconds']/3600)
In [18]:
# Median velocity calculation
ride_velocity = pd.DataFrame(df.resample('H', on='Trip Start Timestamp').velocity.median()).reset_index()
In [19]:
# rides velocity visualizations
trace = go.Bar(x=ride_velocity['Trip Start Timestamp'], y=ride_velocity.velocity)

layout = go.Layout(
    title='Median Velocity [Miles/hr]',
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)

Revenue per minute

In [20]:
# revenue per minute per ride
df_test = df[df['Trip Seconds']!=0]

df_reven = df_test[df['Fare']!=0]

df_reven['rev_min'] = df_reven.Fare/(df_reven['Trip Seconds']/60).round(2)
In [21]:
df_reven.sort_values(by='rev_min', ascending=False).head(2)
Out[21]:
Trip Start Timestamp Trip End Timestamp Trip Seconds Trip Miles Pickup Census Tract Dropoff Census Tract Pickup Community Area Dropoff Community Area Fare Tip ... Shared Trip Authorized Trips Pooled Pickup Centroid Latitude Pickup Centroid Longitude Pickup Centroid Location Dropoff Centroid Latitude Dropoff Centroid Longitude Dropoff Centroid Location velocity rev_min
207349 2018-11-01 18:45:00 2018-11-01 18:45:00 12 20.5 1.703108e+10 1.703198e+10 8.0 76.0 42.5 2 ... False 1 41.893216 -87.637844 POINT (-87.6378442095 41.8932163595) 41.979071 -87.903040 POINT (-87.9030396611 41.9790708201) 6150.0 212.500000
115434 2018-11-01 12:45:00 2018-11-01 12:45:00 4 0.0 1.703184e+10 1.703184e+10 28.0 28.0 12.5 0 ... True 1 41.884768 -87.684147 POINT (-87.6841474493 41.8847677845) 41.884768 -87.684147 POINT (-87.6841474493 41.8847677845) 0.0 178.571429

2 rows × 22 columns

Some inconsistency in data with large amumnt of revenue per minute. It appears that in some cases we have situation where Trip Seconds is 12 and Trip Miles is 20 which is impossible. Lets increase Trip Seconds threshold to 240 seconds.

In [22]:
# revenue per minute per ride
df_test = df[df['Trip Seconds']>=240]

df_reven = df_test[df['Fare']!=0]

df_reven['rev_min'] = df_reven.Fare/(df_reven['Trip Seconds']/60).round(2)
Basic stats: revenue per ride minute
In [23]:
# revenue per minute per ride distribution visualization
fig = go.Figure()
fig.add_trace(go.Box(y=df_reven['rev_min']))

fig.show()

Rides destinations

Hete I will try to figure out what trips are common in terms of city areas:

  • Trips within one city area
  • Trips between different city areas
  • Trips from city to out of city areas
  • Trips from out of city ares to city area
  • Out of city trips
In [24]:
# filter our different kinds of trips
# trips from outside the city to outside city areas
null_rides = df[df['Dropoff Community Area'].isnull() & df['Pickup Community Area'].isnull()]

# trips from city areas to outside the city areas
null_dropoff = df[df['Dropoff Community Area'].isnull() & df['Pickup Community Area'].notnull()]

# trips from outside the city ares to city areas
null_pickup = df[df['Pickup Community Area'].isnull() & df['Dropoff Community Area'].notnull()]

# trips inside one area of the city
same_area = df[df['Dropoff Community Area']==df['Pickup Community Area']]

# trips between different areas of the city
differnt_area = df[df['Dropoff Community Area']!=df['Pickup Community Area']]

city_dif_area = differnt_area[~differnt_area.index.isin(null_dropoff.index)]

city_dif_area = city_dif_area[~city_dif_area.index.isin(null_pickup.index)]

city_dif_area = city_dif_area[~city_dif_area.index.isin(null_rides.index)]

same_city_area = same_area[~same_area.index.isin(null_rides.index)]
In [25]:
# trips kind visualization
labels = ['Rides without area destinations','Rides without dropoff','Rides without pickups','Rides between city areas', 'Rides within same city areas']
values = [len(null_rides),len(null_dropoff),len(null_pickup),len(city_dif_area), len(same_city_area)]
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1', '#b3fec1']

trace = go.Pie(labels=labels, values=values,
               hoverinfo='label+percent', textinfo='value', 
               textfont=dict(size=20),
               marker=dict(colors=colors, 
                           line=dict(color='#000000', width=2)))

pyo.iplot([trace], filename='styled_pie_chart')

Trips Distributions

In [26]:
# different kind of trips hourly distributions
null_pickup_time = pd.DataFrame(null_pickup.resample('H', on='Trip Start Timestamp').size()).reset_index()
null_pickup_time.columns = ['Trip Start Timestamp', 'Count']

null_dropoff_time = pd.DataFrame(null_dropoff.resample('H', on='Trip Start Timestamp').size()).reset_index()
null_dropoff_time.columns = ['Trip Start Timestamp', 'Count']

same_city_area_time = pd.DataFrame(same_city_area.resample('H', on='Trip Start Timestamp').size()).reset_index()
same_city_area_time.columns = ['Trip Start Timestamp', 'Count']

city_dif_area_time = pd.DataFrame(city_dif_area.resample('H', on='Trip Start Timestamp').size()).reset_index()
city_dif_area_time.columns = ['Trip Start Timestamp', 'Count']
In [27]:
# distribution visualizations
fig = make_subplots(rows=4, cols=1)
trace1 = go.Bar(x=null_pickup_time['Trip Start Timestamp'], y=null_pickup_time['Count'])

trace2 = go.Bar(x=null_dropoff_time['Trip Start Timestamp'], y=null_dropoff_time['Count'])

trace3 = go.Bar(x=same_city_area_time['Trip Start Timestamp'], y=same_city_area_time['Count'])

trace4 = go.Bar(x=city_dif_area_time['Trip Start Timestamp'], y=city_dif_area_time['Count'])

fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 2, 1)
fig.append_trace(trace3, 3, 1)
fig.append_trace(trace4, 4, 1)

fig.update_layout = go.Layout(
    title='Rides without Pickup Data',
    autosize=False,
    width=500,
    height=1000,
)

#fig = go.Figure(data=data, layout=layout)
fig.show()

We have different distributions with outside area trips

Choropleth Map

In [28]:
# Number or pickups per city area
pickup_per_area = pd.DataFrame(df.groupby('Pickup Community Area').size()).reset_index()

pickup_per_area['community_area'] = pickup_per_area['Pickup Community Area'].astype(int)
pickup_per_area = pickup_per_area.drop(columns='Pickup Community Area')
pickup_per_area.columns = ['pickup_number', 'community_area']
In [29]:
# Number of dropoffs per city area
dropoff_per_area = pd.DataFrame(df.groupby('Dropoff Community Area').size()).reset_index()

dropoff_per_area['community_area'] = dropoff_per_area['Dropoff Community Area'].astype(int)
dropoff_per_area = dropoff_per_area.drop(columns='Dropoff Community Area')
dropoff_per_area.columns = ['dropoff_number', 'community_area']
In [30]:
# create city pickup and dropoff geo datapoints
chicago['community_area'] = chicago['area_num_1'].astype(int)

chicago = pd.merge(chicago, pickup_per_area, how='left', on='community_area')

chicago = pd.merge(chicago, dropoff_per_area, how='left', on='community_area')
In [31]:
# city coordinates
chicago_center = [41.8781, -87.6298]

pickup_map = folium.Map(location = chicago_center, zoom_start = 10)
In [32]:
# add points pickup points to city map
folium.Choropleth(
    geo_data=chicago,
    name='geometry',
    data=chicago,
    columns=['area_numbe', 'pickup_number'],
    key_on='feature.properties.area_numbe',
    fill_color='YlGn',
    fill_opacity=0.8,
    line_opacity=0.5,
    legend_name='Pickup dencity'
).add_to(pickup_map)
Out[32]:
<folium.features.Choropleth at 0x1c246724a8>
In [33]:
folium.LayerControl().add_to(pickup_map)
Out[33]:
<folium.map.LayerControl at 0x1c2297c518>
In [34]:
display(pickup_map)

Pickup Choropleth

In [35]:
# setup quantile range for choropleth
bins = list(chicago['pickup_number'].quantile([0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]))
In [36]:
# city coordinates
chicago_center = [41.8781, -87.6298]

pickup_map2 = folium.Map(location = chicago_center, zoom_start = 11)
In [37]:
chicago_pickup = chicago
In [38]:
folium.Choropleth(
    geo_data=chicago_pickup,
    name='geometry',
    data=chicago_pickup,
    columns=['area_numbe', 'pickup_number'],
    key_on='feature.properties.area_numbe',
    fill_color='BuPu',
    fill_opacity=0.8,
    line_opacity=0.5,
    legend_name='Pickup dencity',
    bins=bins,
    reset=True
).add_to(pickup_map2)
Out[38]:
<folium.features.Choropleth at 0x1c244ea1d0>
In [39]:
folium.LayerControl().add_to(pickup_map2)
Out[39]:
<folium.map.LayerControl at 0x1c24677780>
In [40]:
chicago_pickup['center'] = chicago_pickup.geometry.centroid
In [41]:
# adding popups with pickup information
for row in chicago_pickup.iterrows():
    row_values = row[1] 
    center_point = row_values['center']
    location = [center_point.y, center_point.x]
    popup = ('Community area: ' + str(row_values['community_area']) + '-' + str(row_values['community']) + 
             '; ' + 'Pickups: ' + str(row_values['pickup_number']))
    marker = folium.Marker(location = location, popup = popup)
    marker.add_to(pickup_map2)
In [42]:
display(pickup_map2)

Dropoff Coropleth

In [43]:
chicago_dropoff = chicago
In [44]:
# quantile range for dropoff choropleth
bins = list(chicago_dropoff['dropoff_number'].quantile([0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]))
In [45]:
#city coordinares
chicago_center = [41.8781, -87.6298]

dropoff_map = folium.Map(location = chicago_center, zoom_start = 11)
In [46]:
chicago_dropoff.drop(columns ='center', inplace=True)
In [47]:
folium.Choropleth(
    geo_data=chicago_dropoff,
    name='geometry',
    data=chicago_dropoff,
    columns=['area_numbe', 'dropoff_number'],
    key_on='feature.properties.area_numbe',
    fill_color='YlGn',
    fill_opacity=0.8,
    line_opacity=0.5,
    legend_name='Dropoff dencity',
    bins=bins,
    reset=True
).add_to(dropoff_map)
Out[47]:
<folium.features.Choropleth at 0x1c224ec438>
In [48]:
folium.LayerControl().add_to(dropoff_map)
Out[48]:
<folium.map.LayerControl at 0x1c2686f4a8>
In [49]:
chicago_dropoff['center'] = chicago_dropoff.geometry.centroid
In [50]:
#adding popups: number of dropoffs per city ares
for row in chicago_dropoff.iterrows():
    row_values = row[1] 
    center_point = row_values['center']
    location = [center_point.y, center_point.x]
    popup = ('Community area: ' + str(row_values['community_area']) + '-' + str(row_values['community']) + 
             '; ' + 'Pickups: ' + str(row_values['pickup_number']))
    marker = folium.Marker(location = location, popup = popup)
    marker.add_to(dropoff_map)
In [51]:
display(dropoff_map)

Matplotlib Choropleths

In [52]:
# matplotlib map to compare visualizations: quantiles and equal intevals
chicago.plot(column='pickup_number', scheme='quantiles', k=10, legend=True, figsize=(20,10))
chicago.plot(column='pickup_number', scheme='equal_interval', k=10, legend=True, figsize=(20,10))
plt.show()

Create City Grid

In [53]:
# create new dataframe
locationsdf = df[['Trip Start Timestamp', 'Trip Seconds', 'Trip Miles', 
                  'Trip Total', 'Pickup Centroid Location', 'Dropoff Centroid Location']]
In [54]:
locationsdf = locationsdf.dropna()
pickup_df = locationsdf[['Trip Start Timestamp', 'Trip Seconds', 'Trip Miles', 
                         'Trip Total', 'Pickup Centroid Location']]
dropoff_df = locationsdf[['Trip Start Timestamp', 'Trip Seconds', 'Trip Miles', 
                          'Trip Total', 'Dropoff Centroid Location']]
In [55]:
pickup_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 261891 entries, 2 to 299999
Data columns (total 5 columns):
Trip Start Timestamp        261891 non-null datetime64[ns]
Trip Seconds                261891 non-null int64
Trip Miles                  261891 non-null float64
Trip Total                  261891 non-null float64
Pickup Centroid Location    261891 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 12.0+ MB
In [56]:
dropoff_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 261891 entries, 2 to 299999
Data columns (total 5 columns):
Trip Start Timestamp         261891 non-null datetime64[ns]
Trip Seconds                 261891 non-null int64
Trip Miles                   261891 non-null float64
Trip Total                   261891 non-null float64
Dropoff Centroid Location    261891 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 12.0+ MB
In [57]:
# create pickup geodataframe with pickup geopoints
geometry = pickup_df['Pickup Centroid Location'].map(shapely.wkt.loads)
pickup_df = pickup_df.drop('Pickup Centroid Location', axis=1)
crs = {'init': 'epsg:4326'}
pickup_gdf = gpd.GeoDataFrame(pickup_df, crs=crs, geometry=geometry)
In [58]:
# create dropoff geodataframe with dropoff geopoints
geometry = dropoff_df['Dropoff Centroid Location'].map(shapely.wkt.loads)
dropoff_df = dropoff_df.drop('Dropoff Centroid Location', axis=1)
crs = {'init': 'epsg:4326'}
dropoff_gdf = gpd.GeoDataFrame(dropoff_df, crs=crs, geometry=geometry)
In [59]:
# Extract max min coordinates
xmin,ymin,xmax,ymax =  pickup_gdf.total_bounds
print(xmin,ymin,xmax,ymax)
-87.913624596 41.6502216756 -87.5349029012 42.0212235931
In [60]:
# city polygon grid
width = 1/300
height = 1/300

cols = np.array(np.arange(xmin, xmax, width))
rows = np.array(np.arange(ymin, ymax, height))


polygons = []
for x in cols:
    for y in rows:
        polygons.append( Polygon([(x,y), (x+width, y), (x+width, y-height), (x, y-height)]) )

grid = gpd.GeoDataFrame({'geometry':polygons})
In [61]:
grid.head()
Out[61]:
geometry
0 POLYGON ((-87.91362459600001 41.6502216756, -8...
1 POLYGON ((-87.91362459600001 41.65355500893333...
2 POLYGON ((-87.91362459600001 41.65688834226666...
3 POLYGON ((-87.91362459600001 41.66022167559999...
4 POLYGON ((-87.91362459600001 41.66355500893332...
In [62]:
# set grid crs: epsg:4326
grid.crs = {'init': 'epsg:4326'}
print(grid.crs)
{'init': 'epsg:4326'}

Adding Grids to Dropoffs

In [63]:
# adding dropoffs to city grid
dropoff_gridaera = gpd.sjoin(dropoff_gdf, grid, op = "within")
In [64]:
dropoff_gridaera.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 260426 entries, 2 to 290317
Data columns (total 6 columns):
Trip Start Timestamp    260426 non-null datetime64[ns]
Trip Seconds            260426 non-null int64
Trip Miles              260426 non-null float64
Trip Total              260426 non-null float64
geometry                260426 non-null object
index_right             260426 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 13.9+ MB
In [65]:
dropoff_gridaera.tail(5)  #gris number as ndex_right
Out[65]:
Trip Start Timestamp Trip Seconds Trip Miles Trip Total geometry index_right
280992 2018-11-01 23:15:00 2212 9.8 10.0 POINT (-87.780270163 41.9201527839) 4561
283310 2018-11-01 23:30:00 806 5.4 7.5 POINT (-87.5399158012 41.7136552552) 12564
288565 2018-11-02 00:00:00 1513 8.4 12.5 POINT (-87.76142159370001 41.9297962382) 5124
289705 2018-11-02 00:00:00 1625 11.6 22.5 POINT (-87.76142159370001 41.9297962382) 5124
290317 2018-11-02 00:15:00 1339 8.1 10.0 POINT (-87.76142159370001 41.9297962382) 5124
In [66]:
# getting numner of dropoffs per grid cell
dropoff_gridnum = pd.DataFrame(dropoff_gridaera.groupby('index_right').size())

dropoff_gridnum.columns = ['number_dropoffs']

dropoff_gridnum.head()
Out[66]:
number_dropoffs
index_right
435 5881
443 2
1220 137
2338 5
2452 221
In [67]:
# join grid df and number of dropoffs
grid_dropoffs = grid.join(dropoff_gridnum, how='outer')

# drop rows with empty grid cells
grid_dropoffs = grid_dropoffs.dropna()

grid_dropoffs = grid_dropoffs.reset_index()

grid_dropoffs.head()
Out[67]:
index geometry number_dropoffs
0 435 POLYGON ((-87.90362459600001 41.9802216755997,... 5881.0
1 443 POLYGON ((-87.90362459600001 42.00688834226634... 2.0
2 1220 POLYGON ((-87.8802912626667 41.98355500893303,... 137.0
3 2338 POLYGON ((-87.8469579293334 41.97688834226637,... 5.0
4 2452 POLYGON ((-87.84362459600007 41.98355500893303... 221.0
In [68]:
# grid dropoff dencity visualization
grid_dropoffs.plot(column='number_dropoffs', scheme='quantiles', k=10, legend=True, figsize=(20,10))
plt.show()
In [69]:
# number of active grid cells
print(len(grid_dropoffs))
798

Grid Dropoff Choropleth

In [70]:
# create quantile range
bins = list(grid_dropoffs['number_dropoffs'].quantile([0, 0.8, 0.85, 0.89, 0.95, 0.97, 0.98, 0.99, 0.998, 1]))
In [71]:
# city coordinates
chicago_center = [41.8781, -87.6298]

dropoff_grid_map = folium.Map(location = chicago_center, zoom_start = 12)
In [72]:
folium.Choropleth(
    geo_data=grid_dropoffs,
    name='geometry',
    data=grid_dropoffs,
    columns= ['index', 'number_dropoffs'],
    key_on='feature.properties.index',
    fill_color='YlGn',
    fill_opacity=0.8,
    line_opacity=0.5,
    legend_name='Dropoff dencity',
    bins=bins,
    reset=True
).add_to(dropoff_grid_map)
Out[72]:
<folium.features.Choropleth at 0x1c47081668>
In [73]:
folium.LayerControl().add_to(dropoff_grid_map)
Out[73]:
<folium.map.LayerControl at 0x1c469f3cc0>
In [74]:
display(dropoff_grid_map)

Adding Famous Places

In [75]:
# Load places dataframe
places = pd.read_csv('/Users/alex/Library/Mobile Documents/com~apple~CloudDocs/Datasets/Places_chicago.csv')

places.head()
Out[75]:
Place Coordinates
0 Millennium Park 41.882600 -87.622600
1 The Art Institute of Chicago 41.879600 -87.623700
2 Lincoln Park Zoo 41.921100 -87.633500
3 Garfield Park Conservatory 41.886300 -87.717300
4 Wrigley Field 41.948400 -87.655300
In [76]:
# create latitude and longitude cols
places = places.join(places.Coordinates.str.split(expand=True))
places = places.drop('Coordinates', axis=1)

places.columns = ['places', 'lat', 'lng']

places.lat = places.lat.astype(float)
places.lng = places.lng.astype(float)

places.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 3 columns):
places    23 non-null object
lat       23 non-null float64
lng       23 non-null float64
dtypes: float64(2), object(1)
memory usage: 632.0+ bytes
  • add popular places
  • Analysis of night rides
In [77]:
# create geometry column
geometry = [Point(xy) for xy in zip (places.lng, places.lat)]
# create geodataframe with famous places
places_gdf = gpd.GeoDataFrame(places, geometry=geometry)

places_gdf.head()
Out[77]:
places lat lng geometry
0 Millennium Park 41.8826 -87.6226 POINT (-87.62260000000001 41.8826)
1 The Art Institute of Chicago 41.8796 -87.6237 POINT (-87.6237 41.8796)
2 Lincoln Park Zoo 41.9211 -87.6335 POINT (-87.6335 41.9211)
3 Garfield Park Conservatory 41.8863 -87.7173 POINT (-87.71729999999999 41.8863)
4 Wrigley Field 41.9484 -87.6553 POINT (-87.6553 41.9484)
In [78]:
# adding famous places popups to choropleth
for row in places_gdf.iterrows():
    row_values = row[1] 
    center_point = row_values['geometry']
    location = [center_point.y, center_point.x]
    popup = (str(row_values['places']))
    marker = folium.Marker(location = location, popup = popup)
    marker.add_to(dropoff_grid_map)
In [79]:
display(dropoff_grid_map)
In [80]:
# Number of grid cells with dropoffs
df['Dropoff Centroid Location'].nunique()
Out[80]:
828

Grid cells with more than 2000 dropoffs per day

In [81]:
grid_dropoffs_max = grid_dropoffs[grid_dropoffs.number_dropoffs > 2000]
In [82]:
grid_dropoffs_max.sort_values(by = 'number_dropoffs', ascending=False).head()
Out[82]:
index geometry number_dropoffs
625 9478 POLYGON ((-87.63362459600026 41.88355500893312... 15447.0
666 9815 POLYGON ((-87.62362459600027 41.88688834226645... 8042.0
0 435 POLYGON ((-87.90362459600001 41.9802216755997,... 5881.0
495 8583 POLYGON ((-87.6602912626669 41.88688834226645,... 5679.0
626 9481 POLYGON ((-87.63362459600026 41.89355500893311... 5384.0

Origin-Destination Matrix (ODM)

In [83]:
# drop columns from main df
df_odm = df.drop(['Trip End Timestamp', 'Pickup Census Tract', 'Dropoff Census Tract', 'Pickup Community Area', 'Dropoff Community Area',
'Shared Trip Authorized', 'Trips Pooled', 'Pickup Centroid Latitude', 'Pickup Centroid Longitude', 'Dropoff Centroid Latitude',
'Dropoff Centroid Longitude'], axis=1)

# filter out trips inside the city
df_odm_ready = df_odm[df_odm['Pickup Centroid Location'].notnull() & df_odm['Dropoff Centroid Location'].notnull()]

# create df copy for dropoff data
df_odm_ready2 = df_odm_ready.copy()
In [84]:
# create pickup goedataframe
geometry_p = df_odm_ready['Pickup Centroid Location'].map(shapely.wkt.loads)
crs = {'init': 'epsg:4326'}
df_odm_pick = gpd.GeoDataFrame(df_odm_ready, crs=crs, geometry=geometry_p).copy() # NOTE changes main df
In [85]:
# create dropoff geodataframe
geometry_d = df_odm_ready2['Dropoff Centroid Location'].map(shapely.wkt.loads)
crs = {'init': 'epsg:4326'}
df_odm_drop = gpd.GeoDataFrame(df_odm_ready, crs=crs, geometry=geometry_d)

Create Grid for ODM

In [86]:
# create grids max/min coordinates
xmin,ymin,xmax,ymax =  df_odm_pick.total_bounds
print(xmin,ymin,xmax,ymax)
-87.913624596 41.6502216756 -87.5349029012 42.0212235931
In [87]:
# create grid
width = 1/30
height = 1/30

cols = np.array(np.arange(xmin, xmax, width))
rows = np.array(np.arange(ymin, ymax, height))


polygons = []
for x in cols:
    for y in rows:
        polygons.append( Polygon([(x,y), (x+width, y), (x+width, y-height), (x, y-height)]) )

grid_odm = gpd.GeoDataFrame({'geometry':polygons})
In [88]:
grid_odm.shape
Out[88]:
(144, 1)

Adjust CRS

In [89]:
# adjust crs for the grid
grid_odm.crs = {'init': 'epsg:4326'}
print(df_odm_pick.crs)
print(df_odm_drop.crs)
{'init': 'epsg:4326'}
{'init': 'epsg:4326'}

SJOIN Grid Cells

In [90]:
# add grid cell index to pickups and dropoffs data
df_odm_pick_grid = gpd.sjoin(df_odm_pick, grid_odm, op = "within")

df_odm_drop_grid = gpd.sjoin(df_odm_drop, grid_odm, op = "within").copy()
In [91]:
df_odm_pick_grid.head(2)
Out[91]:
Trip Start Timestamp Trip Seconds Trip Miles Fare Tip Additional Charges Trip Total Pickup Centroid Location Dropoff Centroid Location velocity geometry index_right
2 2018-11-01 558 2.6 7.5 0 2.5 10.0 POINT (-87.6540070286 41.9147473049) POINT (-87.6318639497 41.8920421365) 16.774194 POINT (-87.65400702860001 41.9147473049) 92
28 2018-11-01 832 4.2 10.0 0 2.5 12.5 POINT (-87.6572331997 41.8852813201) POINT (-87.6467820813 41.938232293) 18.173077 POINT (-87.6572331997 41.8852813201) 92
In [92]:
df_odm_drop_grid.head(2)
Out[92]:
Trip Start Timestamp Trip Seconds Trip Miles Fare Tip Additional Charges Trip Total Pickup Centroid Location Dropoff Centroid Location velocity geometry index_right
2 2018-11-01 558 2.6 7.5 0 2.5 10.0 POINT (-87.6540070286 41.9147473049) POINT (-87.6318639497 41.8920421365) 16.774194 POINT (-87.6318639497 41.8920421365) 104
13 2018-11-01 303 1.0 5.0 0 2.5 7.5 POINT (-87.6378442095 41.8932163595) POINT (-87.6321092196 41.9002656868) 11.881188 POINT (-87.6321092196 41.9002656868) 104

Join by index

In [93]:
df_odm_drop_grid = df_odm_drop_grid.drop(['Trip Start Timestamp', 'Trip Seconds', 'Trip Miles', 'Fare', 'Tip',
       'Additional Charges', 'Trip Total', 'Pickup Centroid Location',
       'Dropoff Centroid Location', 'velocity'], axis=1)

df_omd_grid = pd.merge(df_odm_pick_grid, df_odm_drop_grid, right_index=True, left_index=True)

df_omd_grid = df_omd_grid.drop(['Trip Start Timestamp', 'Trip Seconds', 'Trip Miles', 'Fare', 'Tip',
       'Additional Charges', 'Trip Total', 'Pickup Centroid Location',
       'Dropoff Centroid Location', 'velocity', 'geometry_x', 'geometry_y'], axis=1)
In [94]:
# add ride value
df_omd_grid['value'] = 1

df_omd_grid.head()
Out[94]:
index_right_x index_right_y value
2 92 104 1
28 92 105 1
31 92 81 1
33 92 94 1
36 92 104 1
In [95]:
df_omd_grid.shape
Out[95]:
(259828, 3)
In [96]:
# create origin-destination matrix
odm_matrix = df_omd_grid.pivot_table(index='index_right_x', columns='index_right_y', values='value', aggfunc=np.sum).fillna(0)

print(odm_matrix.shape)
(73, 72)